import os
import re
import urllib.request as ur

import lxml.etree as le
import user_agent


# Build a urllib Request for the given URL, spoofing a random
# desktop-browser User-Agent so the target site serves normal pages.
def getRequest(url):
    headers = {
        'User-Agent': user_agent.get_user_agent_pc(),
    }
    return ur.Request(url=url, headers=headers)

# Obtain a fresh proxy address from the data5u rotation API and return
# an opener that routes HTTP requests through that proxy.
def getProxyOpener():
    api_url = 'http://api.ip.data5u.com/dynamic/get.html'
    address = ur.urlopen(api_url).read().decode('utf-8').strip()
    handler = ur.ProxyHandler({'http': address})
    return ur.build_opener(handler)


# Blog home page: fetch it through a proxy, collect links to popular
# posts, then download each post and save it as blog/<title>.html.
request = getRequest(
        'https://blog.csdn.net'
    )
try:
    response = getProxyOpener().open(request).read()
    # Links to hot blog posts, taken from the "read_num" list items
    href_s = le.HTML(response).xpath('//dd[@class="read_num"]/a/@href')
    # Make sure the output directory exists before writing files
    os.makedirs('blog', exist_ok=True)
    for href in href_s:
        try:
            # Fetch the individual blog post (fresh proxy per request)
            response_blog = getProxyOpener().open(
                getRequest(href)
            ).read()
            # Extract the post title
            title = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')[0]
            # Replace characters that are illegal in (Windows) file names;
            # raw string so the backslash is actually part of the class
            title = re.sub(r'[\\/:*?"<>|]', '-', title)
            print(title)
            with open('blog/%s.html' % title, 'wb') as f:
                f.write(response_blog)
        except Exception as e:
            # Best-effort per post: keep crawling, but say what failed
            print('抓取错误', e)
except Exception as e:
    # Top-level boundary: report instead of silently swallowing
    print('抓取错误', e)



